{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "938cacb9",
   "metadata": {},
   "source": [
    "## 数据准备 Sample"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f05db70",
   "metadata": {},
   "source": [
    "(数据准备/数据采样/数据获取/数据采集)\n",
    "\n",
    "通过使用一个或多个数据表提取和准备用于模型构建的数据样本来对数据进行采样。\n",
    "\n",
    "采样包括定义或子集数据行的操作。样本应足够大，以有效地包含重要信息。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4071adcd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>LIMIT_BAL</th>\n",
       "      <th>SEX</th>\n",
       "      <th>EDUCATION</th>\n",
       "      <th>MARRIAGE</th>\n",
       "      <th>AGE</th>\n",
       "      <th>PAY_0</th>\n",
       "      <th>PAY_2</th>\n",
       "      <th>PAY_3</th>\n",
       "      <th>PAY_4</th>\n",
       "      <th>PAY_5</th>\n",
       "      <th>PAY_6</th>\n",
       "      <th>BILL_AMT1</th>\n",
       "      <th>BILL_AMT2</th>\n",
       "      <th>BILL_AMT3</th>\n",
       "      <th>BILL_AMT4</th>\n",
       "      <th>BILL_AMT5</th>\n",
       "      <th>BILL_AMT6</th>\n",
       "      <th>PAY_AMT1</th>\n",
       "      <th>PAY_AMT2</th>\n",
       "      <th>PAY_AMT3</th>\n",
       "      <th>PAY_AMT4</th>\n",
       "      <th>PAY_AMT5</th>\n",
       "      <th>PAY_AMT6</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>20000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>24</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>-2</td>\n",
       "      <td>-2</td>\n",
       "      <td>3913.00</td>\n",
       "      <td>3102.00</td>\n",
       "      <td>689.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>689.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>120000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>26</td>\n",
       "      <td>-1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2682.00</td>\n",
       "      <td>1725.00</td>\n",
       "      <td>2682.00</td>\n",
       "      <td>3272.00</td>\n",
       "      <td>3455.00</td>\n",
       "      <td>3261.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>90000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>34</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>29239.00</td>\n",
       "      <td>14027.00</td>\n",
       "      <td>13559.00</td>\n",
       "      <td>14331.00</td>\n",
       "      <td>14948.00</td>\n",
       "      <td>15549.00</td>\n",
       "      <td>1518.00</td>\n",
       "      <td>1500.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>5000.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>50000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>46990.00</td>\n",
       "      <td>48233.00</td>\n",
       "      <td>49291.00</td>\n",
       "      <td>28314.00</td>\n",
       "      <td>28959.00</td>\n",
       "      <td>29547.00</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>2019.00</td>\n",
       "      <td>1200.00</td>\n",
       "      <td>1100.00</td>\n",
       "      <td>1069.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>50000.00</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>57</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8617.00</td>\n",
       "      <td>5670.00</td>\n",
       "      <td>35835.00</td>\n",
       "      <td>20940.00</td>\n",
       "      <td>19146.00</td>\n",
       "      <td>19131.00</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>36681.00</td>\n",
       "      <td>10000.00</td>\n",
       "      <td>9000.00</td>\n",
       "      <td>689.00</td>\n",
       "      <td>679.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \\\n",
       "0   1   20000.00    2          2         1   24      2      2     -1     -1   \n",
       "1   2  120000.00    2          2         2   26     -1      2      0      0   \n",
       "2   3   90000.00    2          2         2   34      0      0      0      0   \n",
       "3   4   50000.00    2          2         1   37      0      0      0      0   \n",
       "4   5   50000.00    1          2         1   57     -1      0     -1      0   \n",
       "\n",
       "   PAY_5  PAY_6  BILL_AMT1  BILL_AMT2  BILL_AMT3  BILL_AMT4  BILL_AMT5  \\\n",
       "0     -2     -2    3913.00    3102.00     689.00       0.00       0.00   \n",
       "1      0      2    2682.00    1725.00    2682.00    3272.00    3455.00   \n",
       "2      0      0   29239.00   14027.00   13559.00   14331.00   14948.00   \n",
       "3      0      0   46990.00   48233.00   49291.00   28314.00   28959.00   \n",
       "4      0      0    8617.00    5670.00   35835.00   20940.00   19146.00   \n",
       "\n",
       "   BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  PAY_AMT4  PAY_AMT5  PAY_AMT6  \\\n",
       "0       0.00      0.00    689.00      0.00      0.00      0.00      0.00   \n",
       "1    3261.00      0.00   1000.00   1000.00   1000.00      0.00   2000.00   \n",
       "2   15549.00   1518.00   1500.00   1000.00   1000.00   1000.00   5000.00   \n",
       "3   29547.00   2000.00   2019.00   1200.00   1100.00   1069.00   1000.00   \n",
       "4   19131.00   2000.00  36681.00  10000.00   9000.00    689.00    679.00   \n",
       "\n",
       "   target  \n",
       "0       1  \n",
       "1       1  \n",
       "2       0  \n",
       "3       0  \n",
       "4       0  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from westat import *\n",
    "\n",
    "# westat 自带了 GiveMeSomeCredit 和 UCI_Credit_Card 两个数据集，可使用 GiveMeSomeCredit() 或 credit_card() 导入相关数据\n",
    "# data=GiveMeSomeCredit()\n",
    "# data_train = data.train\n",
    "# data_test = data.test\n",
    "\n",
    "# data=credit_card()\n",
    "\n",
    "\n",
    "data=credit_card()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "55822107",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>LIMIT_BAL</th>\n",
       "      <th>SEX</th>\n",
       "      <th>EDUCATION</th>\n",
       "      <th>MARRIAGE</th>\n",
       "      <th>AGE</th>\n",
       "      <th>PAY_0</th>\n",
       "      <th>PAY_2</th>\n",
       "      <th>PAY_3</th>\n",
       "      <th>PAY_4</th>\n",
       "      <th>PAY_5</th>\n",
       "      <th>PAY_6</th>\n",
       "      <th>BILL_AMT1</th>\n",
       "      <th>BILL_AMT2</th>\n",
       "      <th>BILL_AMT3</th>\n",
       "      <th>BILL_AMT4</th>\n",
       "      <th>BILL_AMT5</th>\n",
       "      <th>BILL_AMT6</th>\n",
       "      <th>PAY_AMT1</th>\n",
       "      <th>PAY_AMT2</th>\n",
       "      <th>PAY_AMT3</th>\n",
       "      <th>PAY_AMT4</th>\n",
       "      <th>PAY_AMT5</th>\n",
       "      <th>PAY_AMT6</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>20000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>24</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>-2</td>\n",
       "      <td>-2</td>\n",
       "      <td>3913.00</td>\n",
       "      <td>3102.00</td>\n",
       "      <td>689.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>689.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>120000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>26</td>\n",
       "      <td>-1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2682.00</td>\n",
       "      <td>1725.00</td>\n",
       "      <td>2682.00</td>\n",
       "      <td>3272.00</td>\n",
       "      <td>3455.00</td>\n",
       "      <td>3261.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>90000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>34</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>29239.00</td>\n",
       "      <td>14027.00</td>\n",
       "      <td>13559.00</td>\n",
       "      <td>14331.00</td>\n",
       "      <td>14948.00</td>\n",
       "      <td>15549.00</td>\n",
       "      <td>1518.00</td>\n",
       "      <td>1500.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>5000.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>50000.00</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>46990.00</td>\n",
       "      <td>48233.00</td>\n",
       "      <td>49291.00</td>\n",
       "      <td>28314.00</td>\n",
       "      <td>28959.00</td>\n",
       "      <td>29547.00</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>2019.00</td>\n",
       "      <td>1200.00</td>\n",
       "      <td>1100.00</td>\n",
       "      <td>1069.00</td>\n",
       "      <td>1000.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>50000.00</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>57</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8617.00</td>\n",
       "      <td>5670.00</td>\n",
       "      <td>35835.00</td>\n",
       "      <td>20940.00</td>\n",
       "      <td>19146.00</td>\n",
       "      <td>19131.00</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>36681.00</td>\n",
       "      <td>10000.00</td>\n",
       "      <td>9000.00</td>\n",
       "      <td>689.00</td>\n",
       "      <td>679.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \\\n",
       "0   1   20000.00    2          2         1   24      2      2     -1     -1   \n",
       "1   2  120000.00    2          2         2   26     -1      2      0      0   \n",
       "2   3   90000.00    2          2         2   34      0      0      0      0   \n",
       "3   4   50000.00    2          2         1   37      0      0      0      0   \n",
       "4   5   50000.00    1          2         1   57     -1      0     -1      0   \n",
       "\n",
       "   PAY_5  PAY_6  BILL_AMT1  BILL_AMT2  BILL_AMT3  BILL_AMT4  BILL_AMT5  \\\n",
       "0     -2     -2    3913.00    3102.00     689.00       0.00       0.00   \n",
       "1      0      2    2682.00    1725.00    2682.00    3272.00    3455.00   \n",
       "2      0      0   29239.00   14027.00   13559.00   14331.00   14948.00   \n",
       "3      0      0   46990.00   48233.00   49291.00   28314.00   28959.00   \n",
       "4      0      0    8617.00    5670.00   35835.00   20940.00   19146.00   \n",
       "\n",
       "   BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  PAY_AMT4  PAY_AMT5  PAY_AMT6  y  \n",
       "0       0.00      0.00    689.00      0.00      0.00      0.00      0.00  1  \n",
       "1    3261.00      0.00   1000.00   1000.00   1000.00      0.00   2000.00  1  \n",
       "2   15549.00   1518.00   1500.00   1000.00   1000.00   1000.00   5000.00  0  \n",
       "3   29547.00   2000.00   2019.00   1200.00   1100.00   1069.00   1000.00  0  \n",
       "4   19131.00   2000.00  36681.00  10000.00   9000.00    689.00    679.00  0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将目标变量重命名为“y”\n",
    "data.rename(columns={'target':'y'},inplace=True)\n",
    "data.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
